import datetime

import scrapy

from scrapy_zhaohaofang.items import ScrapyZhaohaofangItem
from scrapy_zhaohaofang.spiders.utils.item_util import str_compress_blank
from scrapy_zhaohaofang.spiders.utils.wrapper_uitl import item_exception_wrapper
from scrapy_zhaohaofang.spiders.utils.zhaohaofang import ZhfBaseSpider


class QiangWei58Spider(ZhfBaseSpider):
    """Spider for 58.com brand-apartment ("pinpaigongyu") listings in Jinan.

    Walks the paginated list pages up to ``self.max_page_num`` (supplied by
    :class:`ZhfBaseSpider`), follows every detail link, and yields a
    :class:`ScrapyZhaohaofangItem` for each listing whose brand appears in
    ``self.brand_crm_dict`` and whose publish date is newer than
    ``self.begin_date``.  ``room_type_dict`` and ``face_dict`` are also
    expected from the base class (lookup tables for room type / orientation).
    """

    name = "58brand"
    allowed_domains = ['jn.58.com']
    base_url = "http://jn.58.com"

    # Browser-like headers shared by every outgoing request.  Previously this
    # dict was copy-pasted at three call sites; it is read-only, so a single
    # class-level constant is safe.
    DEFAULT_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/65.0.3325.146 Safari/537.36',
        'Host': 'jn.58.com'
    }

    def _request(self, url, callback):
        """Build a GET Request carrying the shared browser-like headers.

        ``dont_filter=True`` matches the original behaviour: list pages are
        revisited and detail URLs may repeat across pages, so Scrapy's
        duplicate filter is bypassed.
        """
        return scrapy.Request(url,
                              headers=self.DEFAULT_HEADERS,
                              callback=callback,
                              dont_filter=True)

    def start_requests(self):
        """Entry point: schedule the first list page."""
        self.logger.info('爬虫开始，进入首页')
        return [self.begin_crawl(self.parse_list_page)]

    def begin_crawl(self, callback):
        """Return the Request for page 1 of the brand-apartment listing."""
        return self._request('http://jn.58.com/pinpaigongyu/pn/1/', callback)

    def parse_list_page(self, response):
        """Parse one list page: follow every detail link, then paginate.

        Stops paginating when there is no "next" link (last page) or when
        the next page number exceeds ``self.max_page_num``.
        """
        detail_page_links = response.css('ul.list > li > a::attr(href)').extract()

        # Follow every detail-page link on this list page.
        for detail_page_link in detail_page_links:
            url = "".join([self.base_url, detail_page_link])
            yield self._request(url, self.parse_detail_page)

        # Next-page link; absent on the last page.  The original code called
        # .split() on the raw extract_first() result, which raised
        # AttributeError (None) on the final page instead of stopping cleanly.
        url_next_page_str = response.css('a.next::attr(href)').extract_first()
        if not url_next_page_str:
            self.logger.info("没有找到下一页链接,停止翻页,当前页面是: %s ." % response.url)
            return

        # The href looks like ".../pinpaigongyu/pn/<N>/"; pull out <N>.
        next_page_num = int(url_next_page_str.split("pn/")[1][:-1])
        if next_page_num > self.max_page_num:
            self.logger.info("已经超过最大的爬取页数,下一页是:%s ." % next_page_num)
        else:
            url_next_page = "".join([self.base_url, url_next_page_str])
            self.logger.info("将要爬取下一页,下一页是:%s ." % next_page_num)
            yield self._request(url_next_page, self.parse_list_page)

    @item_exception_wrapper
    def parse_detail_page(self, response):
        """Parse a listing detail page into a ScrapyZhaohaofangItem.

        The item is yielded only when the brand is tracked in
        ``self.brand_crm_dict`` AND the publish date is newer than
        ``self.begin_date``; otherwise the page is logged and skipped.
        Parsing errors are handled by ``item_exception_wrapper``.
        """
        page_url = response.url
        brand_name = response.css('span.name::text').extract_first()
        if brand_name in self.brand_crm_dict:
            # Expected form: "更新时间：YYYY-MM-DD"
            publish_time_str = response.css('span.tips::text').extract_first()
            if publish_time_str and "更新时间：" in publish_time_str:
                # NOTE: the separator is a full-width colon.
                publish_time_str = publish_time_str.split("：")[1].strip()
                publish_time = datetime.datetime.strptime(publish_time_str, '%Y-%m-%d')
                if publish_time > self.begin_date:
                    item = ScrapyZhaohaofangItem()
                    # CRM fields looked up by brand name.
                    item["bid"] = self.brand_crm_dict[brand_name]["bid"]
                    item["cntr"] = self.brand_crm_dict[brand_name]["cntr"]
                    item["cntrt"] = self.brand_crm_dict[brand_name]["cntrt"]
                    item["cntrm"] = self.brand_crm_dict[brand_name]["cntrm"]
                    item["prt_name"] = self.brand_crm_dict[brand_name]["prt_name"]
                    item["ttl"] = response.css('div.housedetail > h2::text').extract_first()
                    item["mny"] = response.css('div.detail_header > span.price::text').extract_first()
                    item["kwd"] = " ".join(response.css('ul.tags-list li::text').extract())
                    item["dpst"] = 0
                    item["pay"] = 1
                    item["ctm"] = publish_time_str
                    item["gdr"] = 0
                    # Image URLs are protocol-relative ("//..."); prefix a scheme.
                    url_list_orign = response.css("ul.fr > li > img::attr(src)").extract()
                    item["pic_urls"] = ["".join(["http:", url]) for url in url_list_orign]

                    # House-info list: child 1 = area, 2 = layout, 3 = floor,
                    # 4 = address (positional — brittle if the site changes).
                    item["area"] = response.css("ul.house-info-list > li:nth-child(1) > span::text").extract_first().split(" ")[0]
                    item["qrt_addr"] = response.css("ul.house-info-list > li:nth-child(4) > span::text").extract_first()

                    # Title form: "[整租|...]·<estate name>(...) <cbd>" split on spaces.
                    title_array = item["ttl"].split(" ")
                    item["qrt_name"] = title_array[1]
                    if "(" in item["qrt_name"]:
                        item["qrt_name"] = item["qrt_name"][:item["qrt_name"].index("(")]
                    if title_array[0][1:3] == "整租":
                        item["rom"] = 0
                    else:
                        item["rom"] = self.room_type_dict[title_array[2][-2:]]
                    item["cbd"] = title_array[0][4:]
                    # Layout string like "3室2厅1卫 朝南" after blank compression.
                    house_layout_array = str_compress_blank(
                        response.css("ul.house-info-list > li:nth-child(2) > span::text").extract_first()).split(" ")
                    item["name"] = "".join([item["qrt_name"], house_layout_array[0]])
                    house_layout_str = house_layout_array[0]
                    item["room"] = house_layout_str[0]
                    item["hal"] = 0 if "厅" not in house_layout_str else house_layout_str[2]
                    item["tlt"] = 0 if "卫" not in house_layout_str else house_layout_str[-2]
                    # Last character of the second token is the facing direction.
                    face_str = house_layout_array[1][-1]
                    item["ornt"] = 1 if face_str not in self.face_dict else self.face_dict[face_str]
                    floor_str_array = str_compress_blank(
                        response.css("ul.house-info-list > li:nth-child(3) > span::text").extract_first()).split(" ")
                    if "/" in floor_str_array[0]:
                        # "floor/total-floors"
                        floor_array = floor_str_array[0].split("/")
                        item["flr"] = floor_array[0]
                        item["flrs"] = floor_array[1]
                    else:
                        item["flr"] = 0
                        item["flrs"] = 0
                        self.logger.info("解析楼层错误,当前抓取的内容是: %s ,页面路径是: %s , " % (floor_str_array, page_url))
                    # Unclear why, but downstream requires this field to be
                    # present; things break without it.
                    item["pic_pid"] = 0
                    yield item
                else:
                    self.logger.info("发布时间为:%s , 较旧, 舍弃,当前页面是: %s" % (publish_time, page_url))
            else:
                self.logger.info("发布时间为: %s,解析规则无法解析,页面路径是:%s." % (publish_time_str, page_url))
        else:
            self.logger.info("当前抓取到的公寓品牌为 %s ,不在抓取范围内,当前页面地址是: %s." % (brand_name, page_url))

    def closed(self, reason=None):
        """Scrapy shutdown hook — log why the crawl ended."""
        self.logger.info("爬取结束了, 结束原因是: %s " % reason)
